** BPP code for The Effects of Changes in Local Bank Health on Household Consumption
** by Daniel Cooper and Joe Peek, ReStat 2020
/* ////////////////////////////////////////////////////////////////////////// */

/* This code executes the CEX portion of the 2-step procedure needed to generate imputed 
nondurable consumption based on the approach of Blundell, Pistaferi, and Preston (2006) [BPP] 
as described in the readme file entilted "Readme_BPP". In particular, the code sets up the 
CEX data and runs the regressions and saves the relevant coefficients (betas) need to generate 
BPP consumption using data from the PSID. BPPs original analysis only goes through 1992.  
This code extends the approach through 2017. There have been a few modifications (noted inline below) 
to account for changes in the CEX structure after BPP was written.*/

/* The code is divided into three blocks for ease of execution.  

--The first block "gensetup" reads in the CEX dataset generated from the
raw microdata files and performs a number of transformation of the data to get the survey/household 
structure to line up as close as possible with the structure in the PSID.  In particular, 
in the PSID the household head (survey reference person) is nearly always a male unless a 
male is not present in the household. In the CEX, there is a more equal split between 
male/female reference persons (heads). 

--The second block "regdat" uses the transformed data from the "gensetup" block 
and sets up the relevant variables and interaction terms used in the imputaton equation/regression

--The third block "BPPreg" regresses log food consumption on log nondurable consumption 
and a series of household-level controls that are common across the PSID and CEX. 
It then saves the coefficients from this regression to use with the actual PSID data to infer
households' nondurable expenditures in that dataset.*/

local gensetup =1
local regdat =1
local BPPreg = 1


* update the line below with the relevant path for running your analysis
cd ~/cex/imputation_dhc/update2017/

if `gensetup' {
use merge_jan20.dta, clear

*****************
* Preliminaries *
*****************
#delimit;

* Unit ID (newid) naming convention changes over time so make it consistent;
replace newid = newid1 if newid ==. & newid1 ~=.;

*drop extra variables from the dataset that are not needed; 
drop ptref check save hwealth stkbnd costflg intmo mktval_sec ret_sec chgval_sec status_sec numvehq 
savacct_cr savrel1yrago ckacct_cr ckrel1yrago savebnds_cr svbndsel1yrago secrel1yrago secpurch secsales 
busi12moinv busiwdrawass busiwdrawgs svbndrel1yrago iyr rmo ;

***************************************;
* Align CEX and PSID Reference persons*;
***************************************;
* Ultimately we want to track data for the head and spouse (if any) to use in the regressions;
* define a few work-related variables befoe adjusting the reference person.;

gen hours_p1 = inc_hrs1 if relatref ==1;
gen hours_p2 = inc_hrs2 if relatref ==2;

drop inc_hrs1 inc_hrs2;

gen weeks_p1 = wkswkd_ref if relatref ==1;
gen weeks_p2 = wkswkd_sp  if relatref ==2;

drop wkswkd_ref wkswkd_sp;


* ADJUST REFERENCE PERSON AS NEEDED;
* In CEX "head" is person who owns/rents unit where the family lives. In PSID ref person is always the male in the household (if there is one). ;
 
gen hd=relatref==1;
gen wf=relatref==2;

gen couple=hd+wf;

/**********
*(id is quarterly id for consumer unit (newid with interview number appended at end) so ccouple =2 max.;
*everything is done at the quarterly frequency for now before being collapsed to an annual frequency later.;
*************/

bysort id: egen ccouple=sum(couple);

*keep only those consumer units with one or two adults;
keep if ccouple==2|ccouple==1;

* there is an issue with same sex couples starting in 2000s being double counted because
they are both male therefore you cannot do the relation switch as defined above...can figure
out who these folks are based on summing "sexm" by interview.;

gen male2 = 1 if relatref ==1 & sexm ==1;
replace male2 = 1 if relatref ==2 & sexm ==1;

bysort newid intnum:  egen ssexcouple = sum(male2);

* two males has ssexcouple ==2;

*recode relations in accordance with the PSID taking into account same sex couples;
***married and males are heads, married and females are wives***;

bysort id: gen relation1=1 if ccouple==2&sexm==1 & ssexcouple ~=2;
bysort id: replace relation1=2 if ccouple==2&sexm==2;
bysort id: replace relation1=relatref if ccouple==1 | ssexcouple ==2 ;

*drop relatref;
rename relation1 relation;

*setup "head" and "spouse" data based on the transformed relationship variable;

gen sexh = sexm if relation ==1;
gen ageh = agem if relation ==1;
drop sex_ref sexm ;

gen race = race_ref if relation ==1;

*since "head" shifting need to be careful with hours and education;

gen hd_hours = hours_p1 if hours_p1 ~=. & relation ==1;
replace hd_hours = hours_p2 if hd_hours ==. & relation ==1;

gen wf_hours = hours_p1 if hours_p1 ~=. & relation ==2;
replace wf_hours = hours_p2 if hd_hours ==. & relation ==2;

gen hd_weeks = weeks_p1 if weeks_p1 ~=. & relation ==1;
replace hd_weeks = weeks_p2 if hd_weeks ==. & relation ==1;

gen wf_weeks = weeks_p1 if weeks_p1 ~=. & relation ==2;
replace wf_weeks = weeks_p2 if hd_weeks ==. & relation ==2;


gen educh = educ1 if educ1 ~=. & relation ==1;
replace educh = educ2 if educh ==. & relation ==1;

gen educw = educ1 if educ1 ~=. & relation ==2;
replace educw = educ2 if educw ==. & relation ==2;

drop hours_p1 hours_p2 educ1 educ2 weeks_p1 weeks_p2;

* Fill in "wifes" data for head's observation for ease of analysis later.; 

*wife's salary;
capture drop salarys;
gen salarys = salaryx if relation==2;
bysort id: egen msalaryw=max(salarys);
bysort id: replace salarys=msalaryw if salarys==.;

*wife's hours;
gen wf_hours1 = wf_hours if relation==2;
bysort id: egen mwfh=max(wf_hours1);
bysort id: replace wf_hours1=mwfh if wf_hours1==.;
drop wf_hours mwfh;
rename wf_hours1 wf_hours;

* wife's weeks worked;
gen wf_weeks1 = wf_weeks if relation==2;
bysort id: egen mwfh=max(wf_weeks1);
bysort id: replace wf_weeks1=mwfh if wf_weeks1==.;
drop wf_weeks;
rename wf_weeks1 wf_weeks;

*wife's age;
capture drop ages;
gen ages = agem if relation==2;
bysort id: egen mages=max(ages);
bysort id: replace ages=mages if ages==.;
drop agem;

*wife's education;
*capture drop educs;
gen educs = educw if relation==2;
bysort id: egen meducs=max(educs);
bysort id: replace educs=meducs if educs==.;
drop educw;

* Generate expenditure quarters based on the month assigned to a households' expenditures for a given survey.
Expenditures in CEX are reported for a given month but they ultimately cover the three months prior to the households interview month
"emo" is defined as one month prior to the the interview month or the final month of the expenditure period.;

gen qyr=eyr;
gen eqtr=emo;
recode eqtr (1 2 3 = 11) (4 5 6 = 22) (7 8 9 = 33) (10 11 12 = 44) ;
recode eqtr (11=1) (22=2) (33=3) (44=4) ;

*age groups*;
gen agex=ageh;
recode agex (16 17 18 19 = 1) (20 21 22 23 24 = 2) (25 26 27 28 29 = 3) (30 31 32 33 34 = 4) (35 36 37 38 39 = 5) (40 41 42 43 44  = 6) 
(45 46 47 48 49 = 7) (50 51 52 53 54 = 8) (55 56 57 58 59 = 9) (60 61 62 63 64 = 10) (65 66 67 68 69 = 11) (70 71 72 73 74 = 12) 
(75 76 77 78 79  = 13) (80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 = 14);

label values agex ages;

label define ages 1 "16-19" 2 "20-24" 3 "25-29" 4 "30-34" 5 "35-39" 6 "40-44" 7 "45-49" 8 "50-54" 9 "55-59" 10 "60-64" 11 "65-69" 12 "70-74" 13 "75-79" 14 "81+", replace;

********************;
* Refine the sample;
********************;
* only keep "heads" (relation =1) like in the PSId and those units with a complete set of income information;
* Through 2013 the CEX includes a flag for whether the income information from a given unit is complete;
* After 2014 they began imputing missing values for income so this variable was dropped.  BPP use the variable
* to refine their sample so we use it here through the 2013 expenditure year;

keep if relation ==1;

keep if eyr > 2013 | (inccomp ==1 & eyr <=2013);

* there are some duplicate observations even after these refinements that need to be dropped; 
duplicates drop;

* since we are going to use annual data (summ across households quarterly expenditures keep only households
* that are in the in the CEX for all four interviews note that the interview numbering switches from 2--5 to 1--4 in 2015
* in recent years so keep/drop households accordingly ; 


by newid, sort: egen allint=total(intnum);
bys newid: egen minint = min(intnum);

keep if allint==14 | (allint ==10 & minint ==1);

#delimit cr

* following BPP drop single headed households and those with a change in family size or marital status
drop if fam_size==1
by newid, sort: egen minfam=min(fam_size)
by newid, sort: egen maxfam=max(fam_size)
by newid, sort: egen maxmar=max(marital)
by newid, sort: egen minmar=min(marital)

drop if minfam~=maxfam
drop if minmar~=maxmar

* setup hourly wage/salary data
gen hrlyh=salaryx/(hd_hours*hd_weeks)
gen hrlys=salarys/(wf_hours*wf_weeks)

gen salarytot=salaryx+salarys
replace salarytot=salaryx if salarytot==.
replace salarytot=salarys if salarytot==.

* DETERMINE THE REFERENCE YEAR 
* for the annual expenditure data since households interviews can span years.

*we assume if the household is interviewed for at least 6 months at t+1, then the reference year is t+1 and it is t otherwise
*this has to be done last otherwise end up with some people with 3 records rather than 4
*also take into account interview numbering change in 2015.

gen cyrx = eyr -1 if intnum==5 & emo<=5 & minint ==2
replace cyrx = eyr if intnum ==5 & emo >5 & minint ==2
replace cyrx = eyr if intnum ==4 & emo >5 & minint ==1
replace cyrx = eyr-1 if intnum ==4 & emo <=5 & minint ==1

by newid, sort: egen cyr = min(cyrx)
drop cyrx

* setup a few additional series that will be used in the regresion analysis. 
gen yrborn=eyr-ageh
gen yrbornw=eyr-ages

gen owner =1 if tenure <=3
replace owner = 0 if tenure ==4 | tenure ==5

*drop extra variables

#delimit;
drop emo tcws tcrp tcdr tcir tcgp tcwelf tcothi tcppt tcot tcnont tcalimr sumtc  eqtr allint minfam maxfam maxmar
minmar hhid membno relatref occup id ws bp fp rp dr ir gp ss ssi uc wc welf othi fs dgr drr dss ft slt ppt ot nont 
dpp dor alimr fam_gt64 urbn popsize inmsa numearn eyr qyr totexppq ;

#delimit cr

save cexclean_stg1_jan2020, replace
}

********************;
* Transform and clean the data as needed for the regressions;
********************;
if `regdat' {
use  cexclean_stg1_jan2020, clear

sort newid intnum
drop if newid==.
drop intnum

* collapse the expenditure data to annualize it (sum)
* income is already annualized and BPP use the last observation (last interview)
* other variables are taken as of the first or last interview and we take the minimum value for age and other demographic 
* variables that could change slightly over the course of a households interview. The change in income is between the first and last interviews 

#delimit ;
collapse fam_size cyr finalwgt (sum) food  foodhome foodout health healthhosp healthdoc healthdrug healthins perscare tobacco
hous housmort housins houstax housutil housrep housfurn trans transloan transdown transleas pers_serv transtrip
transins transgas transrep veh vehpark vehbus vehtaxi vehoth educsch educoth educchild clothing 
vacat recr educ_expn housrent  (first) dpi12mof = dpi12mo yrborn
(min) marital state region educh educs ageh sex ages  race_ref fam_lt18   
(last) dpi12mo salarytot salaryx salarys hrlyh hrlys inccomp owner, by(newid);

* generate BPPs measure of nondurable expenditures;
* the components are aggregated fromt he detailed expenditure files when reading in the raw data based on BPPs component definitions;
* additional information on cleaning and aggregating the raw data is available from the authors upon request.;
gen nondexp=  food+housutil+veh+transgas+transleas + transrep +clothing+ tob + perscare + pers_serv + housrent;
gen lnondexp = log(nondexp);

gen lnfood=ln(food);

*drop households following BPPs critera (zero before tax income, income less than money spent on food, 
missing food record, zero nondurable expenditures, missing region, missing education);

drop if food==.;
drop if dpi12mo ==0;
drop if dpi12mo <food;
drop if nondexp ==0;
drop if region==.;
drop if educh==.;
*drop if inccomp ~=1;

* BPP also restrict the sample based on age and year born.;

drop if ageh<30; 
drop if ageh>65;

* BPP include a maximum for year born as well but we drop it because the age restriction takes care of it 
* and their sample ends much earlier than ours;
drop if yrborn<1920;

#delimit cr

*****PRICES********
* BPP control for select prices (CPIs) in their regression so here we merge in the relevatn BEA price data
* The 4 CPI-U prices are: Transporation, Food, Alcoholic Beverages, and Motor Fuel.

* The data can be found in the "pricescex2020" dataset or can be downloaded from the BLS website:
* https://www.bls.gov/cpi/data.htm. 

* The database includes the level and log level of each CPI-U price index. Data have been converted from 
* monthly to quarterly (average) before posting to the database.  

gen year = cyr

merge m:1 year using pricescex, keepusing(lnpricefood lnpricealc lnpricefuel lnpricetrans) keep(1 3) nogen

drop year

*****setup for regressions********

* dummies for various demographics

gen agesq=ageh^2

gen ed0=(educh==0)
gen ed1=(educh==1)
gen ed2=(educh==2) 

gen edw1=(educs==1)
gen edw2=(educs==2)
gen edw3=(educs==3 | educs==4) 

gen reg1=(region==1)
gen reg2=(region==2)
gen reg3=(region==3)

gen white=(race_ref==1)

*birth cohorts
gen     coh=1 if (yrborn>=1920&yrborn<1925)
replace coh=2 if (yrborn>=1925&yrborn<1930)
replace coh=3 if (yrborn>=1930&yrborn<1935)
replace coh=4 if (yrborn>=1935&yrborn<1940)
replace coh=5 if (yrborn>=1940&yrborn<1945)
replace coh=6 if (yrborn>=1945&yrborn<1950)
replace coh=7 if (yrborn>=1950&yrborn<1955)
replace coh=8 if (yrborn>=1955&yrborn<1960)
replace coh=9 if (yrborn>=1960&yrborn<1965)
replace coh=10 if (yrborn>=1965&yrborn<1970)
replace coh=11 if (yrborn>=1970&yrborn<1975)
replace coh=12 if (yrborn>=1975&yrborn<1980)     
replace coh=13 if (yrborn>=1980&yrborn<1985)
replace coh=13 if (yrborn>=1980&yrborn<1987)
*stops in 1987 given sample restrictions

*cohort dummies
qui tab coh, gen(cohd)

* generate year dummies  (through 2017)
tabulate cyr, generate(y)

forvalues i = 1/38 {
	local j = `i' + 1979
	rename y`i' yr`j'
}

* dummies for number of kids
* NOTE:  With this command 0 [no kids] gets coded as "child1"
tab fam_lt18, gen(child)
gen ch3ormore = (child4==1 | child5==1 | child6==1 | child7==1 | child8==1 | child9==1 | child10==1 | child11 ==1 | child12 ==1)

drop child4-child11
rename child1 child0
rename child2 child1
rename child3 child2
rename ch3ormore child3

summ child0-child3

drop child12

label var child1 "One child"
label var child2 "Two Children"
label var child3 "3+ Children"

*************Interaction Terms***************

*this all follows BPPs setup

*consumption x year interactions*

forvalues i = 1980/2017 {
	local j = `i'
	gen lnc_x_`j'=lnondexp*yr`i'
}

*consumption x # children interactions*
forvalues i = 1/3 {
	local j = `i'
	gen lnc_x_ch`j'=lnondexp*child`i'
}

*consumption X education interactions*
forvalues i = 1/2 {
	local j = `i'
	gen lnc_x_ed`j'=lnondexp*ed`i'
}

*** calculated instruments ****

* BPP take an IV approach in their prediction equation using head and spouse (implied) wage data as instruments (for nondurable expenditures)

* these are (calculated) hourly wages
gen lnhrlyh=ln(hrlyh)
gen lnhrlys=ln(hrlys)
recode lnhrlyh .=0
recode lnhrlys .=0

gen male= (sexh==1)

egen lwhd=mean(lnhrlyh), by(coh educh cyr male)
egen lwwf=mean(lnhrlys) if male==1, by(coh educs cyr)

replace lwwf = 0 if lwwf ==.

* instruments are also interacted with year, eduaction and number of kids
local i=1980 
while `i'<=2017{
gen lwhd`i'=lwhd*yr`i'
gen lwwf`i'=lwwf*yr`i'
*gen lwwf_alt`i'=lwwf_alt*yr`i'
local i=`i'+1
}

* EDUCATION INTERACTIONS
 
*head
  gen lwhd_ed0=lwhd*ed0
  gen lwhd_ed1=lwhd*ed1
  gen lwhd_ed2=lwhd*ed2

*wife
  gen lwwf_ed0=lwwf*ed0
  gen lwwf_ed1=lwwf*ed1
  gen lwwf_ed2=lwwf*ed2

****Number of kids interactions

*children interactions head*
  gen lwhd_ch1=lwhd*child1
  gen lwhd_ch2=lwhd*child2
  gen lwhd_ch3=lwhd*child3

*children interactions wife*
  gen lwwf_ch1=lwwf*child1
  gen lwwf_ch2=lwwf*child2
  gen lwwf_ch3=lwwf*child3

save cexclean_full, replace

}

/* REGRESSIONS */ 
if `BPPreg' {
* RUN the BPP imputation regression and save the relevant coefficients;
use cexclean_full, clear

#delimit;

* regress log food consumption on log nondurable consumption and series of household characteristics and price controls
(coefficients are inverted for the imputation in the PSID following BPP);

* this version run through 2015 because that is the time horizon for which we have/use PSID data. ;
eststo: ivreg lnfood (lnondexp lnc_x_1981-lnc_x_2015 lnc_x_ed* lnc_x_ch* = lwhd lwhd1981-lwhd2015 lwwf lwhd_ed1 lwhd_ed2   lwwf_ed1 lwwf_ed2 lwhd_ch1-lwhd_ch3 lwwf_ch1-lwwf_ch3) 
child1-child3 ed1 ed2 lnpricefood lnpricefuel lnpricealc lnpricetrans white fam_size cohd2-cohd13 ageh agesq reg1-reg3;

* save the coefficients;
#delimit cr

gen bi0_lc =_b[lnondexp]                     
gen bi0_lced2  =_b[lnc_x_ed1]
gen bi0_lced3  =_b[lnc_x_ed2]

gen bi0_lch1  =_b[lnc_x_ch1]
gen bi0_lch2  =_b[lnc_x_ch2]
gen bi0_lch3  =_b[lnc_x_ch3]

local i=1981
while `i' <=2015{
gen bi0_lc`i'=_b[lnc_x_`i']
local i=`i'+1               
}

gen bi0_age    =_b[ageh]
gen bi0_age2   =_b[agesq]

gen bi0_lpf     =_b[lnpricefood]
gen bi0_lpfuel  =_b[lnpricefuel]
gen bi0_lpalc   =_b[lnpricealc]
gen bi0_lptr    =_b[lnpricetrans]

gen bi0_edu1   =_b[ed1]
gen bi0_edu2   =_b[ed2] 
gen bi0_cohd2  =_b[cohd2] 
gen bi0_cohd3  =_b[cohd3] 
gen bi0_cohd4  =_b[cohd4] 
gen bi0_cohd5  =_b[cohd5] 
gen bi0_cohd6  =_b[cohd6] 
gen bi0_cohd7  =_b[cohd7] 
gen bi0_cohd8  =_b[cohd8] 
gen bi0_cohd9  =_b[cohd9] 
gen bi0_cohd10  =_b[cohd10] 
gen bi0_cohd11  =_b[cohd11] 
gen bi0_cohd12  =_b[cohd12] 
gen bi0_cohd13  =_b[cohd13] 

gen bi0_child1   =_b[child1]
gen bi0_child2   =_b[child2] 
gen bi0_child3   =_b[child3] 

gen bi0_famsize  =_b[fam_size] 
gen bi0_race=_b[white]
gen bi0_reg1=_b[reg1]
gen bi0_reg2=_b[reg2]
gen bi0_reg3=_b[reg3]
gen bi0_cons=_b[_cons]

rename cyr year

sort year

by year:  egen meanf    = mean(lnfood) 
by year:  egen meannds  = mean(lnondexp)


*collapse the data for ease of merging with the PSID
*value of the coefficients are the same across year except where there are year-specific effects
preserve
keep bi0_* year meanf meannds
collapse bi0_* meanf meannds, by(year)
sort year
save imputebetas_bpp,replace
restore
}
